{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# `vaex` @ EuroScipy Bilbao 2019\n",
    "\n",
    "## New York Taxi Dataset (2009-2015): Exploratory Data Analysis and Machine Learning example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:18:27.578191Z",
     "start_time": "2019-09-03T21:18:26.681111Z"
    }
   },
   "outputs": [],
   "source": [
    "import vaex"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read in the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:18:28.426812Z",
     "start_time": "2019-09-03T21:18:28.327533Z"
    }
   },
   "outputs": [],
   "source": [
    "# df = vaex.open('/data/yellow_taxi_2009_2015_f32.hdf5')\n",
    "df = vaex.open('s3://vaex/taxi/yellow_taxi_2009_2015_f32.hdf5')\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Re-define the test set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:18:29.897518Z",
     "start_time": "2019-09-03T21:18:29.887322Z"
    }
   },
   "outputs": [],
   "source": [
    "# The test set\n",
    "df_test = df[1_026_944_937:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Inspect the test set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:18:32.640879Z",
     "start_time": "2019-09-03T21:18:32.601529Z"
    }
   },
   "outputs": [],
   "source": [
    "df_test.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load the _state_ into the test DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:18:43.141940Z",
     "start_time": "2019-09-03T21:18:42.296523Z"
    }
   },
   "outputs": [],
   "source": [
    "# This is where the magic happens\n",
    "df_test.state_load('./taxi_ml_state.json')\n",
    "# df_test.state_load('./copy-taxi_ml_state.json') # in case we mess up (which never happens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:18:51.504764Z",
     "start_time": "2019-09-03T21:18:48.338221Z"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Lets see what happened\n",
    "df_test.head(21)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:19:02.321950Z",
     "start_time": "2019-09-03T21:19:00.160268Z"
    }
   },
   "outputs": [],
   "source": [
    "# Check the performance\n",
    "from sklearn.metrics import mean_absolute_error, mean_squared_error\n",
    "\n",
    "mae_test_score = mean_absolute_error(df_test[:1_000_000].trip_duration_min.values, \n",
    "                                     df_test[:1_000_000].lightgbm_prediction.values)\n",
    "mse_test_score = mean_squared_error(df_test[:1_000_000].trip_duration_min.values, \n",
    "                                    df_test[:1_000_000].lightgbm_prediction.values)\n",
    "\n",
    "print('The mean absolute error is %2.3f' % mae_test_score)\n",
    "print('The mean squared score is %2.3f' % mse_test_score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:19:03.144637Z",
     "start_time": "2019-09-03T21:19:03.141498Z"
    }
   },
   "outputs": [],
   "source": [
    "df_test.column_count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:19:07.631600Z",
     "start_time": "2019-09-03T21:19:07.531450Z"
    }
   },
   "outputs": [],
   "source": [
    "df_test.lightgbm_prediction._graphviz()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:19:25.596175Z",
     "start_time": "2019-09-03T21:19:25.546184Z"
    }
   },
   "outputs": [],
   "source": [
    "df_test.state_get()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Thank you for your attention!"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
